Grammar of Graphics and Ggplot2

An influential book by Leland Wilkinson with this title lays out a theoretical framework to systematically construct quantitative graphics. Hadley Wickham extended these and related ideas and implemented them in the ggplot2 package. If you are interested in more details, this paper would be a great starting point: http://vita.had.co.nz/papers/layered-grammar.pdf.

Note that these ideas deal only with static graphics.

At a very high level, this is what you would do with ggplot:

  • create a plot with the ggplot() call
  • add layers using various geom_*() or stat_*() calls. Some properties (e.g. position) of these layers can be set in these calls.
  • specify “aesthetics” using aes() for the full plot or per-layer. This deals with selecting variables to plot, colours, sizes, shapes of the plot elements etc. Aesthetics can be “mapped” to variables in the data.
  • optionally specify faceting
  • optionally adjust the scale, the axes and the legend
  • optionally customise the theme

Resources:

For a quick overview of ggplot, see https://r4ds.had.co.nz/data-visualisation.html.

To learn and practice more, the Rmd files here are great: https://github.com/hadley/ggplot2-book


We load the data saved by the tidy.Rmd file (part-2 of the tutorial).

library(tidyverse)

data <- read_csv("data/ua_populations.csv")
sex_ratio_change <- read_csv("data/sex_ratio_change.csv")

Create a tidy population data set

pop_data <- data %>% 
  rename(total = population, male = pop_male, female = pop_female) %>% 
  select(-c(area, pop_change, pop_change_percent)) %>% 
  gather(sex, population, total, male, female)

pop_data
## # A tibble: 8,331 x 5
##    ua_no ua                   year sex   population
##    <dbl> <chr>               <dbl> <chr>      <dbl>
##  1     1 Greater Mumbai U.A.  1961 total    4515495
##  2     1 Greater Mumbai U.A.  1971 total    6596370
##  3     1 Greater Mumbai U.A.  1981 total    9421962
##  4     1 Greater Mumbai U.A.  1991 total   12596243
##  5     1 Greater Mumbai U.A.  2001 total   16434386
##  6     1 Greater Mumbai U.A.  2011 total   18394912
##  7     2 Delhi  U.A.          1961 total    2359408
##  8     2 Delhi  U.A.          1971 total    3647023
##  9     2 Delhi  U.A.          1981 total    5760811
## 10     2 Delhi  U.A.          1991 total    8471625
## # ... with 8,321 more rows

Male and Female Population Distributions

pop_data %>% 
  filter(sex != "total" & year == 1961) %>% 
  ggplot(aes(x = log(population), fill = sex, alpha = 0.5)) +
    geom_density(position = "identity")


Try changing geom_density() to geom_histogram(). Also try the position = "dodge" option. You could also add facet_wrap() to get the distributions for the years separately.

Total Population Distributions over Time

library(ggridges)

pop_data %>% 
  filter(sex == "total") %>% 
  ggplot(aes(x = log10(population), 
             y = factor(year, levels = rev(levels(factor(year)))))) +
    stat_density_ridges() 

Line graphs - change over time

Plotting for one city - total population over time

pop_data %>%
  filter(ua == "Vapi ^" & sex == "total") %>% 
  ggplot(aes(x = year, y = population)) +
    geom_line()

Facet wrap

gap_increasing <- sex_ratio_change %>% 
  top_n(20, sex_ratio_slope)

gap_decreasing <- sex_ratio_change %>% 
  top_n(-20, sex_ratio_slope)

p <- pop_data %>%
  semi_join(gap_decreasing, by = "ua_no") %>% 
  filter(sex == "total") %>% 
  ggplot(aes(x = year, y = population)) +
    geom_line() +
    facet_wrap(~ua, scales = "free_y")

p + theme(axis.text.x = element_text(angle = 45, hjust = 1)) 

Multiple cities - male vs. female

Disparity closing

pop_data %>%
  semi_join(gap_decreasing, by = "ua_no") %>% 
  filter(sex != "total") %>% 
  ggplot() +
    geom_line(aes(x = year, y = population, colour = sex)) + 
    facet_wrap(~ua, scales = "free_y", ncol = 4, strip.position = "bottom") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) 

Disparity increasing

pop_data %>%
  semi_join(gap_increasing, by = "ua_no") %>% 
  filter(sex != "total") %>% 
  ggplot() +
    geom_line(aes(x = year, y = population, colour = sex)) + 
    facet_wrap(~ua, scales = "free_y", ncol = 4, strip.position = "bottom") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) 

2011 population: bar plot

pop_data %>% 
  filter(year == 2011 & sex == "total") %>% 
  arrange(desc(population)) %>% 
  top_n(30) %>% 
  ggplot() +
    geom_bar(aes(x = reorder(factor(ua), population), y = population), stat = "identity") +
    coord_flip() +
    xlab("City") +
    scale_y_continuous(label = scales::comma)

Scatter plot

cities_scatter <- data %>% 
  filter(year == 2011) %>% 
  mutate(
    sex_ratio = pop_male / pop_female,
    pop_density = population / area     
  ) %>%
  ggplot(aes(x = log10(population), y = pop_density, label = ua)) +
    geom_point(aes(size = log10(population),
                   colour = sex_ratio,
                   alpha = 0.4)) +
    scale_color_gradient(low = "green", high = "red") +
    scale_y_reverse()

cities_scatter

With labels

library(ggrepel)

cities_scatter_labelled <- cities_scatter +
  geom_label_repel(aes(label = ifelse(sex_ratio > quantile(sex_ratio, 0.98, na.rm = TRUE) |
                                        sex_ratio < quantile(sex_ratio, 0.02, na.rm = TRUE) |
                                        pop_density > quantile(pop_density, 0.98, na.rm = TRUE) |
                                        pop_density < quantile(pop_density, 0.02, na.rm = TRUE), 
                                      ua, '')))

cities_scatter_labelled

Interactive Scatter plot using plotly

plotly::ggplotly(cities_scatter)

Scatter Plot Animation

Let’s try something fancy!

#library(animation)

mega_cities <- data %>% 
  filter(year == 2011 & population > 3000000 & ua_no != 6) %>% 
  select(ua_no)
  
draw_scatter <- function(y) {
  p <- data %>% 
    filter(year == y) %>% 
    semi_join(mega_cities, by = "ua_no") %>% 
    mutate(
      sex_ratio = pop_male / pop_female,
      pop_density = population / area     
    ) %>%
    ggplot(aes(x = sex_ratio, y = pop_density, label = ua)) +
      geom_point(aes(size = sqrt(population) / 300,
                     colour = ua,
                     alpha = 0.5)) +
      xlim(.8, 1.8) +
      ylim(25000, 0) +
      guides(size = FALSE, alpha = FALSE) + 
      scale_size_identity()
  
  ggsave(paste0("images/anim_", y, ".png"), plot = p)
  
  p
}

#saveGIF(c(1961, 1971, 1981, 1991, 2001, 2011) %>% map(get_scatter), 
#        interval = .2,
#        movie.name = "megacities_evolution.gif")

scatter_plots <- c(1961, 1971, 1981, 1991, 2001, 2011) %>% map(draw_scatter)

shell("magick convert -delay 100 -loop 1 images/anim*.png images/megacities_evolution.gif")

Megacities{width = 200px}